In phuonghtht/PMA_dengue_v2: Lab diagnosis

# Changeable: site:KH or HC?,dataInput?
site <- "HC" # HC, KH, KH&HC|HC&KH
dataInput <-"C1D1" # 6 cases: C1D1, C2D1, C3D1 

setwd("~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/FOI/Constant")

##Results from different IS and infecting serotype infering models:
if ( dataInput == "C1D1"){
    data <- read.csv("~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/Infecting serotype Model/PredictedIS_B23_567_pred.serotype_C1D1.csv")  
}
if ( dataInput == "C2D1"){
    data <- read.csv("~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/Infecting serotype Model/PredictedIS_B23_567_pred.serotype_C2D1.csv")  
}
if ( dataInput == "C3D1"){
    data <- read.csv("~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/Infecting serotype Model/PredictedIS_B23_567_pred.serotype_C3D1.csv")  
}
if ( dataInput == "C1F1"){
    data <- read.csv("~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/Infecting serotype Model/PredictedIS_B23_567_pred.serotype_HomoAssigned_C1F1.csv")  
}
if ( dataInput == "C2F1"){
    data <- read.csv("~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/Infecting serotype Model/PredictedIS_B23_567_pred.serotype_HomoAssigned_C2F1.csv")  
}
if ( dataInput == "C3F1"){
    data <- read.csv("~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/Infecting serotype Model/PredictedIS_B23_567_pred.serotype_HomoAssigned_C3F1.csv")  
}


data$YEAR <- as.factor(data$YEAR)
data$predictedIS<- ordered(data$predictedIS,levels=c("Secondary","Primary","Neg"))
data$pred.serotype<- as.factor(data$pred.serotype)

pop<- data[which(!is.na(data$AGE_MIN)&is.na(data$PanbioUnit)),]# cross out acute and ELISA samples
pop$Site <- substr(pop$sampleID,1,2)

#Agegroup_ 1year
pop$AgeGroup <-factor(as.factor(floor(pop$Age)),labels=c("[1-2)","[2-3)","[3-4)","[4-5)","[5-6)","[6-7)","[7-8)","[8-9)","[9-10)","[10-11)","[11-12)","[12-13)","[13-14)","[14-15)","[15-16)","[16-17)","[17-18)","[18-19)","[19-20)","[20-21)","[21-22)","[22-23)","[23-24)","[24-25)","[25-26)","[26-27)","[27-28)","[28-29)","[29-30)","[30,31)"))# group as  1year 1 group.

#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
## Agegroup_ 2 years : 16groups

# pop$AgeGroup <-factor(as.factor(floor(pop$Age/2)+1),labels=c("[1-2)","[2-4)","[4-6)","[6-8)","[8-10)","[10-12)","[12-14)","[14-16)","[16-18)","[18-20)","[20-22)","[22-24)","[24-26)","[26-28)","[28-30)","[30-31)"))

#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
##Agegroup_3years : 11 groups

# pop$AgeGroup <- floor(pop$Age/3) + 1
# pop$AgeGroup[which(pop$AgeGroup == 11)] <- 10
# pop$AgeGroup <- factor(
#   pop$AgeGroup,
#   labels = c("[1-3)","[3-6)","[6-9)","[9-12)","[12-15)","[15-18)",
#              "[18-21)","[21-24)","[24-27)","[27-31)"))

Model

For each age: numbers of samples: number of neg samples: numbers of samples classified as primary: number of samples classified as secondary:

library(dplyr)
library(tidyr)
# spread(x,y): spread rows of col x into collums, then value of col y will fit in as rows.

#rename(new_name=old_name).

if (site=="HC"){
   p_year=dplyr::filter(pop,Site=="HC")%>%group_by(AgeGroup,YEAR,Site)%>%dplyr::count(predictedIS)%>%spread(predictedIS,n)}
if(site == "KH"){
    p_year=dplyr::filter(pop,Site=="KH")%>%group_by(AgeGroup,YEAR,Site)%>%dplyr::count(predictedIS)%>%spread(predictedIS,n)}
if(site == "KH&HC"|site == "HC&KH"){
    p_year=dplyr::filter(pop)%>%group_by(AgeGroup,YEAR,Site)%>%dplyr::count(predictedIS)%>%spread(predictedIS,n)}

# p_year$total <- apply(p_year[,3:8],1,sum,na.rm=T)

p_year$total <- apply(p_year[,4:6],1,sum,na.rm=T)
p_year <- p_year[order(p_year$YEAR),]

#create function for substr age rather than put age in order, this is useful in case of missing data in a specific age: 
my_substr <- function(s){
  if (nchar(s) == 5){
    t <- substr(s, 4, 4)
  }else{
    if (nchar(s) == 6){
      t <- substr(s, 4, 5)
    }else{
      t <- substr(s, 5, 6)
    }
  }
  return(t)
}

#
p_year$age<- apply(p_year[,1],1,my_substr)

# Assign NA as 0 because stan does not support NA value:

for (idx in 4:6){
  idx.na <- which(is.na(p_year[[idx]]))
  p_year[[idx]][idx.na] <- 0
}



# setnames(p_year, old=c("AgeGroup","1","2","3","4","Neg","Secondary","total"),new=c("AgeGroup","DV1.13","DV2.13","DV3.13","DV4.13","Neg.13","Sec.13","total.13"))

For each year in the past up to age a:FOI, in the following code this assumed to be equal across serotypes and the same for all years.

This gets the data in the right format for stan.

b = length(p_year$age)

# number of years: HC: 34 yrs, KH: 35 yrs
n_y <- 2017-2013+max(as.integer(p_year[which(p_year$YEAR==2013),]$age))

l_vecj = length(c(rep(1:round(n_y/5), each =  5)))

#changes made 5/08/2020 Maxine --
datapp<- list(a = b,
              datap=c(p_year[1:b,6],
                      p_year[1:b,5],
                      p_year[1:b,4]),
              veca = as.integer(p_year$age),
              vectotal = as.integer(p_year$total)) #vector of total data points per age group
# ----
library(rstan)

# stanc("~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/FOI/Constant/Dengue_FOI_constant.stan")

FOI_Dengue <-"~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/FOI/Constant/Dengue_FOI_constant_Maxine.stan"

Stan does not support NA in data? => Assign NA as 0

Some explanation

Rhat: < 1.05: model perform well n-eff ratio = n-eff/(0.5 * iter * chains) <1 Iterations: 1/2 Warmup : noise data=> exclude?!, 1/2 Sampling

Posterior distribution (likelihood function= model and prior distribution: e.g: lamda[0,1] ).

# you then run using this: From this output we can quickly assess model convergence by looking at the Rhat values for each parameter. When these are at or near 1, the chains have converged. There are many other diagnostics, but this is an important one for Stan.
library(loo)
 fit<-stan(file = FOI_Dengue,
          data=datapp, iter=6000,
          chains=4,
          cores = parallel::detectCores()) # this may be conducted via 2 steps:  stand_model() and sampling().

# save data for later use without running code again
saveRDS(fit,paste("FOI constant",p_year$Site[1],dataInput,".rds"))

# fit <- readRDS('/home/phuonght/Dropbox/1.SHARED_FOLDER/slides_June_2018/PMA_batch2To7_July2019/pop data/models/FOI/Constant/FOI constant KH C1D1 .rds')

posterior<-rstan::extract(fit) # specify package rstan rather tham tidyr
str(posterior)
#this summarizes the parameters, gets mean,etc. of parameters and metrics of whether converged or not like Rhat and neff
foisummary<-summary(fit)
foisummary
tab <- data.frame(foisummary$summary)
loglike = extract_log_lik(fit, parameter_name = "log_like", merge_chains = TRUE)
#write.csv(tab, paste("FOI constant",p_year$Site[1],dataInput,".csv"))

#plot the posterior distribution of lambda
#create dataframe for violin plot:
p <- data.frame(lambda = posterior$lambda)
p$year <- rep("FOI")
# violin plot
lamb <- ggplot(p, aes(x=year,y=lambda)) + geom_violin()+
  ggtitle(paste("posterior lambda _ constant",p_year$Site[1],dataInput))+
  coord_cartesian(ylim=c (0.0,0.03))+
  theme(plot.title = element_text(hjust=0.5,face="bold",size=20,colour = "blue"),
        axis.title.x = element_blank(),
        axis.text = element_text(face = "bold", angle = 0, hjust = 1))+
  # stat_summary(fun.y=mean, geom="point", size=2, color="red")+ # add mean
  # geom_boxplot(width=0.1)+# add median and quartile
  stat_summary(fun.data=mean_sdl, 
               geom="pointrange", color="red")#Add mean and standard deviation
# save the figure
#ggsave(filename=paste("FOI_constant",site,dataInput,".png"), plot = lamb)

all plots in one

site <- "HC" # HC, KH, KH&HC|HC&KH
dataInput <-"C1D1" # 6 cases: C1D1, C2D1, C3D1 

fit <- readRDS('~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/FOI/Constant/FOI constant HC C1D1 .rds')

posterior<-rstan::extract(fit) 

p <- data.frame(lambda = posterior$lambda)
p$year <- rep("FOI")

p6<-ggplot(p, aes(x=year,y=lambda)) + geom_violin()+
  ggtitle(paste(site,dataInput))+
  coord_cartesian(ylim=c (0.0,0.03))+
  theme(plot.title = element_text(hjust=0.5,face="bold",size=12,colour = "blue"),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.text = element_text(face = "bold", angle = 0, hjust = 1))+
  # stat_summary(fun.y=mean, geom="point", size=2, color="red")+ # add mean
  # geom_boxplot(width=0.1)+# add median and quartile
  stat_summary(fun.data=mean_sdl,
               geom="pointrange", color="red")#Add mean and standard deviation

library(gridExtra)
library(grid)
lamb<- grid.arrange(p1,p2,p3,p4,p5,p6,nrow=2,ncol=3,bottom=textGrob("Posterior lambda distribution_constant",gp=gpar(fontsize=20,font=8,col="dark blue")))

# ggsave(filename ="Posterior lambda distribution_constant.png",plot=lamb )